Data Processing

# Get crime data from ArcGIS API and remove (4) entries with missing geometry
# Write cleaned data to GeoJSON file
# crime_dg <- st_read("https://utility.arcgis.com/usrsvcs/servers/3adad6320b7a421bb3826ec8871e2b66/rest/services/OpenData_PublicSafety/MapServer/2/query?outFields=*&where=1%3D1&f=geojson")
# crime_dg$Date <- as.Date(fread(".//data//crime_dg.csv")$Date, "%Y/%m/%d")
# crime_dg <- crime_dg[!st_is_empty(crime_dg),]
# st_write(crime_dg, ".//data//crime_dg.geojson")

# Read in Hartford crime and parcel data
# c <- st_read(".//data//crime_dg.geojson")
# p <- st_read(".//data//parcel_hartford.geojson")

# Filter parcels for single family homes only
# Entire apartment or residential complexes are bought less frequently 
# and are more complicated to appraise.
# resid_labels <- c("ONE FAMILY") #, "TWO FAMILY", "THREE FAMILY")
# pr <- p[p$State_Use_Description %in% resid_labels, ]

# Filter crimes from 2016-2021
# Property appraisal was made in 2021.
# Assume that about 5 years of crime data is sufficient to capture the effect
# of crime on property values if any effect exists.
# cc <- c[as.Date(c$Date) >= as.Date("2016-01-01") & 
#           as.Date(c$Date) <= as.Date("2021-12-31"),]

# Save filtered data
# st_write(pr, ".//data//parcel_hartford_single_family.geojson")
# st_write(cc, ".//data//crime_hartford_2016_2021.geojson")
# Read in filtered Hartford crime, parcel, and population data
# Crime incidents as point features (written by the commented-out
# filtering pipeline above; 2016-2021, empty geometries removed)
z <- st_read(".//data//crime_hartford_2016_2021.geojson")
## Reading layer `crime_hartford_2016_2021' from data source 
##   `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\crime_hartford_2016_2021.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 197060 features and 12 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -72.71865 ymin: 41.72403 xmax: -72.65041 ymax: 41.80719
## Geodetic CRS:  WGS 84
# Single-family parcel polygons (written by the commented-out
# filtering pipeline above)
p <- st_read(".//data//parcel_hartford_single_family.geojson")
## Reading layer `parcel_hartford_single_family' from data source 
##   `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\parcel_hartford_single_family.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 7239 features and 47 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -72.71637 ymin: 41.72374 xmax: -72.65809 ymax: 41.80743
## Geodetic CRS:  WGS 84
# Tract-level population table: skip 3 metadata rows, keep only the first
# three columns (includes Census.Tract and Estimated.Population, used by
# the commented-out crime-rate pipeline above).
# FIX: spell out TRUE -- T is a reassignable variable, not a keyword.
pop <- read.csv(".//data//population_by_tract.csv", skip = 3, header = TRUE)[, 1:3]

# Census-tract polygons lying entirely within the town of Hartford,
# reprojected to the crime data's CRS (WGS 84) so they can be plotted
# and joined with the crime/parcel layers.
hartford_town <- subset(county_subdivisions(state = "CT"),
                        NAMELSAD == "Hartford town")
hartford_tracts <- tracts(state = "CT") |>
  st_filter(hartford_town, .predicate = st_within) |>
  st_transform(crs = st_crs(z))
## Retrieving data for the year 2021
## Retrieving data for the year 2021
# Clip Hartford's water-body polygons to the tract boundaries after
# reprojecting them to the tracts' CRS.
hartford_water <- area_water("CT", "Hartford")
water <- st_intersection(
  st_transform(hartford_water, crs = st_crs(hartford_tracts)),
  hartford_tracts)
## Retrieving data for the year 2021
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
# Rename tract columns to distinguish from parcel and crime data
# hartford_tracts <- hartford_tracts %>%
#   rename(tract_geometry = geometry, tract_name = NAME, tract_id = GEOID)

# # Extract year from crime date
# c$year <- year(as.Date(c$Date))

# # Join parcel and crime data to census tracts (keep tract geometry)
# tp <- st_join(hartford_tracts, p)
# tc <- st_join(hartford_tracts, c)

# # Extract short tract name from in population data to match tract polygons
# pop$tract_name <- gsub("[^0-9.]", "", pop$Census.Tract)

# # Calculate average housing price by census tract
# ap <- tp %>%
#   group_by(tract_name) %>%
#   summarise(avg_house_value = mean(Assessed_Total, na.rm = TRUE))
# 
# # Calculate crime rate by census tract and year
# at <- tc[1:1000,] %>% 
#   group_by(tract_name, year) %>%
#   summarise(crime_count = n()) %>%
#   left_join(pop, by = "tract_name") %>%
#   mutate(crime_rate_per_1000 = crime_count / Estimated.Population * 1000) %>%
#   select(tract_name, year, crime_rate_per_1000) %>%
#   pivot_wider(names_from = year, values_from = crime_rate_per_1000) %>%
#   left_join(ap, by = "tract_name") %>%
#   select(tract_name, everything()) 
# 
#   write.csv(at, ".//data//crime_rate_by_tract.csv", row.names = FALSE)
# 
# tc

Data Exploration

The filters applied to the data are:

The manually curated variables are:

The highly correlated numeric covariates are:

There appear to be more thefts and violent crimes near single family homes in dilapidated condition. Homes in worse condition tend to be older, but there are also older homes in good condition.

There is not a clear relationship between the year the parcel was built and its condition.

All covariates appear to correlate with the response variable, Assessed_Total.

# install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Subset major property crimes and violent crimes by UCR category.
# grepl() on NA yields FALSE, so rows with a missing category are
# excluded, exactly as with index-based grep() subsetting.
theft_pattern <- "ROBBERY|BURGLARY|LARCENY|THEFT"
violence_pattern <- "ASSAULT|HOMICIDE|SHOOTING"
stealin <- z[grepl(theft_pattern, z$UCR_1_Category), ]
hurtin <- z[grepl(violence_pattern, z$UCR_1_Category), ]

# Count thefts and violent crimes within 0.1 miles of each parcel.
# st_is_within_distance() returns one index vector per parcel; the crime
# count is simply that vector's length, so the vectorized lengths()
# replaces sapply(..., length) -- same integer result, no type-instability
# risk and no per-element R call overhead.
p$thefts <- lengths(
  st_is_within_distance(p, stealin, dist = units::set_units(0.1, "mi")))
p$violence <- lengths(
  st_is_within_distance(p, hurtin, dist = units::set_units(0.1, "mi")))
# Order the parcel condition labels from worst to best so that plots and
# model dummy contrasts follow the quality scale (baseline = Dilapidated).
condition_scale <- c("Dilapidated", "Very Poor", "Poor", "Fair", "Fair-Avg",
                     "Average", "Avg-Good", "Good", "Good-VG", "Very Good",
                     "Excellent")
p$Condition_Description <- factor(as.character(p$Condition_Description),
                                  levels = condition_scale)

# Keep the id column, the response, and candidate predictors; give them
# short names for modeling; drop rows with any missing value.
model_cols <- c("OBJECTID", "Assessed_Total", "thefts", "violence",
                "Living_Area", "Effective_Area", "AYB",
                "Number_of_Bedroom", "Number_of_Baths",
                "Condition_Description")
X <- p[, model_cols] %>%
  rename(Price = Assessed_Total, Thefts = thefts, Violence = violence,
         Year = AYB, Bed = Number_of_Bedroom, Bath = Number_of_Baths,
         Condition = Condition_Description) %>%
  na.omit()
# Check that we haven't dropped too many rows
# na.omit() removed only 19 of 7239 parcels (~0.3%)
nrow(p) # 7239 rows
## [1] 7239
nrow(X) # 7220 rows
## [1] 7220
# Pairwise scatterplots and correlations for the numeric columns (2:9);
# Condition (column 10) is excluded.
pair_plot <- ggpairs(X, columns = 2:9, lower = list(continuous = "points")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(pair_plot)

# Boxplots of each numeric predictor split by condition description.
# Reshape to long format (one row per parcel-variable pair) so that
# facet_wrap() can give each variable its own panel and free scale.
LX <- X %>%
  pivot_longer(cols = c(Thefts, Violence, Living_Area, Effective_Area,
                        Year, Bed, Bath, Price),
               names_to = "variable", values_to = "value")
condition_boxplots <- ggplot(LX, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = Condition), position = position_dodge(1)) +
  facet_wrap(~variable, scales = "free") +
  theme(axis.title.x = element_blank(), axis.text.x = element_blank(),
        axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(),
        axis.title.y = element_blank())
print(condition_boxplots)

Analysis

Variable Selection. We leave out Thefts since it is highly correlated with Violence but has a lower correlation with Price than Violence does. For similar reasons, we leave out Effective_Area and keep Living_Area. We keep both Bed and Bath for now.

Transformations. From the correlation plots, we can see that Price, Living_Area, and Violence are right-skewed. To adjust for this, we take the log of Price and the square root of Living_Area and Violence.

# Fit linear models with no transformations
# Baseline model: untransformed response and covariates (Thefts and
# Effective_Area already dropped per the variable-selection notes above).
m0 <- lm(Price ~ Violence + Living_Area + Year + Bed + Bath + Condition, 
         data = X)
summary(m0)
## 
## Call:
## lm(formula = Price ~ Violence + Living_Area + Year + Bed + Bath + 
##     Condition, data = X)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74690  -7114    855   6837 110641 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.440e+05  1.437e+04 -16.975  < 2e-16 ***
## Violence           -2.269e+02  6.194e+00 -36.632  < 2e-16 ***
## Living_Area         3.349e+01  3.247e-01 103.138  < 2e-16 ***
## Year                1.122e+02  6.642e+00  16.890  < 2e-16 ***
## Bed                -2.306e+03  2.287e+02 -10.086  < 2e-16 ***
## Bath                5.429e+03  3.547e+02  15.303  < 2e-16 ***
## ConditionVery Poor  7.343e+02  8.675e+03   0.085   0.9325    
## ConditionPoor       1.342e+04  6.797e+03   1.974   0.0484 *  
## ConditionFair       2.704e+04  6.392e+03   4.230 2.36e-05 ***
## ConditionFair-Avg   3.316e+04  6.243e+03   5.312 1.12e-07 ***
## ConditionAverage    4.205e+04  6.147e+03   6.841 8.51e-12 ***
## ConditionAvg-Good   4.959e+04  6.142e+03   8.073 7.97e-16 ***
## ConditionGood       5.046e+04  6.142e+03   8.215 2.49e-16 ***
## ConditionGood-VG    5.461e+04  6.161e+03   8.864  < 2e-16 ***
## ConditionVery Good  5.723e+04  6.164e+03   9.284  < 2e-16 ***
## ConditionExcellent  6.476e+04  6.314e+03  10.257  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13710 on 7204 degrees of freedom
## Multiple R-squared:  0.826,  Adjusted R-squared:  0.8256 
## F-statistic:  2280 on 15 and 7204 DF,  p-value: < 2.2e-16
# Plot diagnostics
# 2x2 grid: residuals vs fitted, normal Q-Q, scale-location, leverage
par(mfrow = c(2, 2))
plot(m0)

# Fit linear models with transformations
# log response, sqrt on the right-skewed covariates (see the
# Transformations note above)
m1 <- lm(log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + Year + 
           Bed + Bath + Condition, 
         data = X)
summary(m1)
summary(m1)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + 
##     Year + Bed + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26043 -0.07630  0.01616  0.09684  0.72052 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.902e+00  1.769e-01  39.017  < 2e-16 ***
## sqrt(Violence)     -3.826e-02  8.649e-04 -44.240  < 2e-16 ***
## sqrt(Living_Area)   2.827e-02  3.796e-04  74.452  < 2e-16 ***
## Year                9.011e-04  8.068e-05  11.170  < 2e-16 ***
## Bed                -9.815e-03  2.813e-03  -3.489 0.000488 ***
## Bath                4.065e-02  4.154e-03   9.786  < 2e-16 ***
## ConditionVery Poor  7.608e-01  1.043e-01   7.294 3.33e-13 ***
## ConditionPoor       9.349e-01  8.171e-02  11.441  < 2e-16 ***
## ConditionFair       1.071e+00  7.686e-02  13.937  < 2e-16 ***
## ConditionFair-Avg   1.136e+00  7.507e-02  15.138  < 2e-16 ***
## ConditionAverage    1.381e+00  7.392e-02  18.684  < 2e-16 ***
## ConditionAvg-Good   1.503e+00  7.386e-02  20.354  < 2e-16 ***
## ConditionGood       1.520e+00  7.386e-02  20.587  < 2e-16 ***
## ConditionGood-VG    1.555e+00  7.409e-02  20.989  < 2e-16 ***
## ConditionVery Good  1.583e+00  7.412e-02  21.353  < 2e-16 ***
## ConditionExcellent  1.654e+00  7.592e-02  21.779  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1648 on 7204 degrees of freedom
## Multiple R-squared:  0.763,  Adjusted R-squared:  0.7625 
## F-statistic:  1546 on 15 and 7204 DF,  p-value: < 2.2e-16
# Diagnostic plots for m1
# NOTE(review): no par(mfrow = c(2, 2)) in this chunk -- knitr resets
# graphical parameters between chunks, so confirm the intended layout
plot(m1)

There are many parcels whose standardized residuals fall more than 4 residual standard errors below zero. This means the model is severely overestimating the value of these parcels. Let’s find out what these parcels are.

Solution. Refer to the previous correlation plots and observe that Price and Living Area appear to have a strong linear relationship. However, we performed a log transformation on the Price variable but a square-root transformation on the Living Area variable. This asymmetry may be the cause of the large residuals.

# Investigate the large negative residuals
# Scale residuals by the residual standard error; this ignores leverage,
# so these are approximate standardized residuals
r <- residuals(m1) / summary(m1)$sigma # standardized residuals (approx.)
o <- X[r < -4, ]
o$StdResid <- r[r < -4]
nrow(o)
## [1] 31
# Plot the offending parcels geographically over the tract polygons.
# FIX: axis labels were swapped -- on a WGS 84 map the x axis is
# longitude and the y axis is latitude.
g <- ggplot(o) +
  geom_sf(data = hartford_tracts) +
  geom_sf(data = o) +
  stat_sf_coordinates(size = 1, color = "red") +
  labs(x = "Longitude", y = "Latitude") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations 
# Annotation helper for stat_summary(): returns a one-row data frame
# labeling a group with "n = <count>", positioned just above the group's
# third quartile.
give.n <- function(x) {
  data.frame(
    y = quantile(x, 0.75) + 0.1,
    label = paste0("n = ", length(x))
  )
}
# Boxplots of standardized residuals by condition, annotated with group
# sizes via give.n().
# FIXES: (1) dropped the deprecated `fun.y` argument -- `fun.data` already
# supplies the y position, so `fun.y` was ignored and only produced a
# warning; (2) theme_bw() now comes BEFORE the theme() tweak -- a complete
# theme replaces all theme settings, so the previous order silently
# discarded the rotated x-axis labels.
ggplot(data = o, aes(x = Condition, y = StdResid)) + 
  geom_boxplot() + 
  stat_summary(fun.data = give.n, geom = "text") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot residuals vs transformed numeric covariates
o <- o %>%
  mutate(logPrice = log(Price), 
         sqrtViolence = sqrt(Violence),
         sqrtLiving_Area = sqrt(Living_Area))
# NOTE(review): selecting columns by position is fragile -- c(12:15, 7:9)
# presumably picks StdResid, the three transformed columns, and
# Year/Bed/Bath; verify the indices against the actual column order,
# since sf/dplyr may place the geometry column differently.
ggpairs(o, columns = c(12:15,7:9), lower = list(continuous = "points"),
        progress = F) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Let’s test our hypothesis.

# Take log of living area
m2 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year + 
           Bed + Bath + Condition, 
         data = X)
summary(m2)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) + 
##     Year + Bed + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26663 -0.08683  0.01414  0.10361  1.09631 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.1877985  0.2012699  20.807  < 2e-16 ***
## sqrt(Violence)     -0.0414237  0.0008987 -46.094  < 2e-16 ***
## log(Living_Area)    0.5556087  0.0082575  67.286  < 2e-16 ***
## Year                0.0007582  0.0000839   9.036  < 2e-16 ***
## Bed                -0.0010192  0.0029147  -0.350    0.727    
## Bath                0.0751240  0.0041665  18.031  < 2e-16 ***
## ConditionVery Poor  0.7690082  0.1087200   7.073 1.66e-12 ***
## ConditionPoor       0.9304490  0.0851789  10.923  < 2e-16 ***
## ConditionFair       1.0710818  0.0801190  13.369  < 2e-16 ***
## ConditionFair-Avg   1.1307146  0.0782497  14.450  < 2e-16 ***
## ConditionAverage    1.3693732  0.0770514  17.772  < 2e-16 ***
## ConditionAvg-Good   1.4886232  0.0769892  19.335  < 2e-16 ***
## ConditionGood       1.5035464  0.0769862  19.530  < 2e-16 ***
## ConditionGood-VG    1.5399516  0.0772256  19.941  < 2e-16 ***
## ConditionVery Good  1.5703838  0.0772606  20.326  < 2e-16 ***
## ConditionExcellent  1.6457653  0.0791454  20.794  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1718 on 7204 degrees of freedom
## Multiple R-squared:  0.7425, Adjusted R-squared:  0.7419 
## F-statistic:  1385 on 15 and 7204 DF,  p-value: < 2.2e-16
# Diagnostic plots for m2 in a 2x2 grid
par(mfrow = c(2, 2))
plot(m2)

# Drop number of bedrooms
# Bed was not significant in m2 (p = 0.727), so remove it.
m3 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year + 
           Bath + Condition, 
         data = X)
summary(m3)
## 
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) + 
##     Year + Bath + Condition, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26656 -0.08634  0.01404  0.10379  1.09043 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.195e+00  2.002e-01  20.950  < 2e-16 ***
## sqrt(Violence)     -4.145e-02  8.947e-04 -46.332  < 2e-16 ***
## log(Living_Area)    5.541e-01  6.987e-03  79.297  < 2e-16 ***
## Year                7.591e-04  8.385e-05   9.053  < 2e-16 ***
## Bath                7.486e-02  4.099e-03  18.266  < 2e-16 ***
## ConditionVery Poor  7.689e-01  1.087e-01   7.073 1.66e-12 ***
## ConditionPoor       9.300e-01  8.516e-02  10.920  < 2e-16 ***
## ConditionFair       1.071e+00  8.010e-02  13.365  < 2e-16 ***
## ConditionFair-Avg   1.130e+00  7.824e-02  14.447  < 2e-16 ***
## ConditionAverage    1.369e+00  7.704e-02  17.770  < 2e-16 ***
## ConditionAvg-Good   1.488e+00  7.697e-02  19.333  < 2e-16 ***
## ConditionGood       1.503e+00  7.697e-02  19.528  < 2e-16 ***
## ConditionGood-VG    1.539e+00  7.721e-02  19.939  < 2e-16 ***
## ConditionVery Good  1.570e+00  7.724e-02  20.324  < 2e-16 ***
## ConditionExcellent  1.645e+00  7.913e-02  20.793  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1718 on 7205 degrees of freedom
## Multiple R-squared:  0.7425, Adjusted R-squared:  0.742 
## F-statistic:  1484 on 14 and 7205 DF,  p-value: < 2.2e-16
# Diagnostic plots for the final model m3
plot(m3)

# Compare residuals beyond 4 SE's
# m1 (sqrt Living_Area, with Bed) vs m3 (log Living_Area, no Bed):
# count of scaled residuals with |r| > 4
r1 <- residuals(m1) / summary(m1)$sigma
sum(r1 < -4 | r1 > 4)
## [1] 33
r3 <- residuals(m3) / summary(m3)$sigma
sum(r3 < -4 | r3 > 4)
## [1] 21

Repeating the same residuals analysis from before, we see that there are no longer any strongly correlated covariates with the residuals.

# Investigate the large negative residuals
# Same analysis as for m1, now on the final model m3
r <- residuals(m3) / summary(m3)$sigma # standardized residuals (approx.)
o <- X[r < -4, ]
o$StdResid <- r[r < -4]
nrow(o)
## [1] 17
# Plot the offending parcels geographically over the tract polygons.
# FIX: axis labels were swapped -- on a WGS 84 map the x axis is
# longitude and the y axis is latitude.
g <- ggplot(o) +
  geom_sf(data = hartford_tracts) +
  geom_sf(data = o) +
  stat_sf_coordinates(size = 1, color = "red") +
  labs(x = "Longitude", y = "Latitude") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition
# Function for number of observations 
# Annotation helper for stat_summary(): labels a group with
# "n = <count>" just above its third quartile.
# NOTE(review): identical to the give.n defined earlier in this document;
# consider defining it once near the top.
give.n <- function(x) {
  data.frame(
    y = quantile(x, 0.75) + 0.1,
    label = paste0("n = ", length(x))
  )
}
# Boxplots of standardized residuals by condition, annotated with group
# sizes via give.n().
# FIXES: (1) dropped the deprecated `fun.y` argument -- `fun.data` already
# supplies the y position, so `fun.y` was ignored and only produced a
# warning; (2) theme_bw() now comes BEFORE the theme() tweak -- a complete
# theme replaces all theme settings, so the previous order silently
# discarded the rotated x-axis labels.
ggplot(data = o, aes(x = Condition, y = StdResid)) + 
  geom_boxplot() + 
  stat_summary(fun.data = give.n, geom = "text") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot residuals vs transformed numeric covariates
o <- o %>%
  mutate(logPrice = log(Price), 
         sqrtViolence = sqrt(Violence),
         logLiving_Area = log(Living_Area))
# NOTE(review): positional selection c(12:15, 7, 9) skips column 8
# (presumably Bed, which was dropped from m3); verify the indices against
# the actual column order before trusting this plot.
ggpairs(o, columns = c(12:15,7,9), lower = list(continuous = "points"),
        progress = F) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Other stuff

# library(glmnet)

# # Format predictors and response variable from parcel data
# X <- p %>%
#   dplyr::select(Condition_Description, 
#                 AYB, Living_Area, Effective_Area, 
#                 Total_Rooms, Number_of_Bedroom, Number_of_Baths,
#                 thefts, violence) %>%
#   # mutate(sqrt_Living_Area = sqrt(Living_Area),
#   #        sqrt_Effective_Area = sqrt(Effective_Area),
#   #        thefts1 = thefts + 1,
#   #        thefts2 = thefts*2) %>%
#   sf::st_drop_geometry()
# X <- data.matrix(X)
# y <- p$Assessed_Total

# # Remove rows with missing values
# w <- complete.cases(X, y)
# X <- X[w, ]
# y <- y[w]

# # Select variables
# # Run lasso regression 
# cv_tune.lasso_model = suppressMessages(suppressWarnings(
#   cv.glmnet(x = X,
#             y = y,
#             nlambda = 1000,
#             nfolds = 500,
#             pmax = 15,
#             parallel = TRUE)))
# 
# plot(cv_tune.lasso_model)
# 
# lasso_modelmin = glmnet(x = X, y = y, lambda = cv_tune.lasso_model$lambda.min)
# lasso_model1se = glmnet(x = X, y = y, lambda = cv_tune.lasso_model$lambda.1se)
# 
# # Does not select Total_Rooms
# coef(lasso_modelmin) 
# # Does not select Total_Rooms, Number_of_Bedroom, thefts
# coef(lasso_model1se)
# 
# vars_keepmin = rownames(coef(lasso_modelmin))[
#   which(as.matrix(coef(lasso_modelmin)) != 0)][-1]
# vars_keep1se = rownames(coef(lasso_model1se))[
#   which(as.matrix(coef(lasso_model1se)) != 0)][-1]
# # Calculate centroids of parcels and crime
# centroids <- st_centroid(p)
# coords <- st_coordinates(centroids)
# p$centroid.x <- coords[, 'X']
# p$centroid.y <- coords[, 'Y']
# 
# centroids <- st_centroid(c)
# coords <- st_coordinates(centroids)
# c$centroid.x <- coords[, 'X']
# c$centroid.y <- coords[, 'Y']
# 
# 
# # Filter property crimes and violent crimes
# stealin <- c[grep("ROBBERY|BURGLARY|LARCENY|THEFT|STOLEN", 
#                    c$UCR_1_Category), ]
# hurtin <- c[grep("ASSAULT|HOMICIDE|SHOOTING", c$UCR_1_Category), ]
# 
# # Get number of thefts and violent crimes within 0.1 miles of each parcel
# p$thefts <- sapply(
#   st_is_within_distance(p, stealin, dist = units::set_units(0.1, "mi")),  
#   length)
# p$violence <- sapply(
#   st_is_within_distance(p, hurtin, dist = units::set_units(0.1, "mi")), 
#   length)
# 
# st_within(p$geometry[1], p$geometry[1])
# st_within(p$geometry[1], hartford_tracts[1])

Data Exploration

# # For each census tract, get average property value in 2021 and 
# # average annual crime count 
# # Create a new dataframe
# library(dplyr)
# # Add a column for year
# c <- c %>%
#   mutate(year = year(as.Date(Date)))
# # Drop geometry data 
# cc <- sf::st_drop_geometry(c)
# pp <- sf::st_drop_geometry(p)
# 
# # Calculate total crimes per year
# crimes_per_year <- table(cc$year)
# 
# 
# select(year) %>%
#   group_by(year) %>%
#   summarise(total_crime_per_yr = n())
# 
# 
# # Aggregate crime to census level to get crime rate
# # Plot census tracts colored by crime rates and sized by property values
# l1 = leaflet(c_tract) %>%
#   
#   addProviderTiles('CartoDB.Positron') %>%
#   
#   ## census tracts
#   addPolygons(fillColor = ~pal1(rescaled.house.value), 
#               label = ~label %>% lapply(htmltools::HTML), 
#               weight = 0.5,
#               color = 'black',
#               fillOpacity = 0.8) %>%
#   
#   # stops
#   addCircleMarkers(data = ds,
#                    lng = ~stop1_lon,
#                    lat = ~stop1_lat,
#                    label = ~label %>% lapply(htmltools::HTML),
#                    color = pubred,
#                    radius = ~log(`n_to_Grand Central` + `n_to_New Haven`)) %>%
#   
#   setView(lng = -73.3, 
#           lat = 41.21979, 
#           zoom = 10) %>%
#   
#   addTiles()
# # Plot sample of crimes and parcels
# 
# # library(mapview)
# # mapview(dplyr::sample_n(c, 1e3), dplyr::sample_n(p, 1e3))
# 
# g <- ggplot(dplyr::sample_n(c, 1e3)) +
#    geom_sf(data = hartford_tracts) +
#    geom_sf(data = dplyr::sample_n(p, 1e3)) +
#    geom_density_2d(aes(X,Y), data = ~cbind(.x, st_coordinates(.x))) +
#    stat_sf_coordinates(size = 0.1, color = "red") +
#    labs(x = "Latitude", y = "Longitude") +
#    theme_bw()
# 
# p <- toWebGL(ggplotly(g))
# p$x$data[[4]]$hoverinfo <- "none"
# p
# # Plot average residential parcel value by census tract 
# # with crime counts binned by area
# 
# # Map parcel address to 
# 
# 
# # Get average residential parcel value by census tract
# parcel_dg$Zone
# 
# hartford_tracts
# 
# parcel_dg$avg_residential_value <- parcel_dg$AV_LAND / parcel_dg$AV_TOTAL

References

Spatial regression

https://oerstatistics.wordpress.com/wp-content/uploads/2016/03/intro_to_r.pdf#page=68.08

https://crd230.github.io/lab8.html

Kernel density estimation

https://seeing-statistics.com/issue4/